Take Home Exercise 3

1. Introduction

This exercise will attempt to Question 1 of Mini Challenge 2 from Vast Challenge 2023. This exercise focuses on utilizing visual analytics to identify anomalies in the business groups within the knowledge graph.

2. Data Preparation

2.1 Installing Packages

In the code chunk below, necessary R packages are installed to import data, data preparation and visualization.

Show the code
pacman::p_load(jsonlite, tidygraph, ggraph, dplyr,
               visNetwork, graphlayouts, ggforce, knitr, kableExtra,
               skimr, tidytext, tidyverse, igraph, ggplot2, RColorBrewer, wordcloud)

2.2 Importing Data

In the code chunk below, fromJSON function is used to import MC3 data into R environment.

Show the code
mc3_data <- fromJSON("data/MC3.json")

2.3 Extracting Edges

The code chunk below is used to extract links from mc3_data and stored in the tibble data frame called “mc3_edges”.

Show the code
mc3_edges <- as_tibble(mc3_data$links) %>% 
  distinct() %>%
  mutate(source = as.character(source),
         target = as.character(target),
         type = as.character(type)) %>%
  group_by(source, target, type) %>%
    summarise(weights = n()) %>%
  filter(source!=target) %>%
  ungroup()

2.4 Extracting Nodes

The code chunk below is used to extract nodes from mc3_data and stored in the tibble data frame called “mc3_edges”.

Show the code
mc3_nodes <- as_tibble(mc3_data$nodes) %>%
  mutate(country = as.character(country),
         id = as.character(id),
         product_services = as.character(product_services),
         revenue_omu = as.numeric(as.character(revenue_omu)),
         type = as.character(type)) %>%
  select(id, country, type, revenue_omu, product_services)

2.4 Exploring Data

In the code chunk below, datatable() functions is used to display mc3_edges and mc3_nodes tibble data frame as an interactive table.

Show the code
DT :: datatable(mc3_edges,options = list(lengthMenu = c(3, 10, 20)))
Show the code
DT::datatable(mc3_nodes,options = list(lengthMenu = c(3, 10, 20)))

2.5 Text Sensing

In this code chunk, unnest_tokens function is used to split workds in “product_services” column.

Show the code
token_nodes <- mc3_nodes %>%
  unnest_tokens(word, 
                product_services)

2.6 Removing Stop Words

Tidytext package function called stop_words is used to remove stop words.

Show the code
stopwords_removed <- token_nodes %>% 
  anti_join(stop_words)

3. Network Visualization and Analysis

3.1 Building Network Model

In the code chunk below, a new node data table is prepared by using the source and target field of mc3_edges data frame.

Show the code
id1 <- mc3_edges %>%
  select(source) %>%
  rename(id = source)
id2 <- mc3_edges %>%
  select(target) %>%
  rename(id = target)
mc3_nodes1 <- rbind(id1, id2) %>%
  distinct() %>%
  left_join(mc3_nodes,
            unmatched = "drop")

In this code chunk, tidy graph data model is created.

Show the code
mc3_graph <- tbl_graph(nodes = mc3_nodes1,
                       edges = mc3_edges,
                       directed = FALSE) %>%
  mutate(betweenness_centrality = centrality_betweenness(),
         degree_centrality = centrality_degree())
mc3_df <- as.data.frame(mc3_graph, what = "both")

3.2 Centrality Measurements

Betweenness Centrality and Degree Centrality measurements are used to quantify each node’s interaction in the network and visualize the interactions. In the Code chunk below, mean and quartiles of betweenness centrality and degree centrality are calculated.

Show the code
# Calculate mean and quartiles
mean_betweenness <- mean(mc3_df$betweenness_centrality, na.rm = TRUE)
quartiles_betweenness <- quantile(mc3_df$betweenness_centrality, probs = c(0.5,0.9,0.95,0.99), na.rm = TRUE)
max_betweenness <- max(mc3_df$betweenness_centrality, na.rm = TRUE)

mean_degree <- mean(mc3_df$degree_centrality, na.rm = TRUE)
quartiles_degree <- quantile(mc3_df$degree_centrality, probs = c(0.50,0.90,0.95), na.rm = TRUE)
max_degree <- max(mc3_df$degree_centrality, na.rm = TRUE)

After that,the results are stored in the respective data frames.

Show the code
# Create a data frame for betweenness centrality
betweenness_df <- data.frame(
  Measure = c("Mean", "Quartile 0.50 (Median)", "Quartile 0.90", "Quartile 0.95", "Quartile 0.99", "Maximum"),
  Value = c(mean_betweenness, quartiles_betweenness, max_betweenness)
)

# Create a data frame for degree centrality
degree_df <- data.frame(
  Measure = c("Mean", "Quartile 0.50 (Median)", "Quartile 0.90","Quartile 0.95","Maximum"),
  Value = c(mean_degree, quartiles_degree,max_degree)
)

3.2.1 Mean and Quartiles of Centrality Measurement

Show the code
# Create the table for betweenness centrality
#| fig-height: 4
betweenness_df %>%
  kable() %>%
  kable_styling("hover", full_width = F) %>%
  row_spec(0, bold = T, color = "white", background = "#D7261E") %>%
  add_header_above(header = c("Betweenness Centrality Measurement" = ncol(betweenness_df)))
Betweenness Centrality Measurement
Measure Value
Mean 8393.619
Quartile 0.50 (Median) 0.000
Quartile 0.90 3.000
Quartile 0.95 176.400
Quartile 0.99 165050.000
Maximum 3849384.703
Show the code
# Create the table for degree centrality
#| fig-height: 4
degree_df %>%
  kbl() %>%
  kable_paper("hover", full_width = F)%>%
  row_spec(0, bold = T, color = "white", background = "#D7261E") %>%
  add_header_above(header = c("Degree Centrality Measurement" = ncol(degree_df)))
Degree Centrality Measurement
Measure Value
Mean 1.287965
Quartile 0.50 (Median) 1.000000
Quartile 0.90 2.000000
Quartile 0.95 3.000000
Maximum 120.000000

3.2.2 Betweenness Centrality

An interactive plot showing top few nodes and edges with highest betweenness_centrality will be created.

For this, in this code chunk, edges with betweenness_centrality value over 100,000 are filtered and stored in new data frame.

Show the code
# Filter edges based on betweenness centrality threshold
edges_top_betweenness <- mc3_graph %>%
  filter(betweenness_centrality > 100000) %>%
  activate(edges)%>%
  as.tibble()%>%
  select(from, to, type, weights)

In this code chunk, nodes with betweenness_centrality value over 100,000 are filtered and stored in new data frame.

Show the code
nodes_top_betweenness <- mc3_graph %>% filter(betweenness_centrality>100000)%>%
  activate(nodes) %>%
  as.tibble() %>%
  rename(label = id) %>%
  mutate(id=row_number()) %>%
  select(id, label, type, country)

Then, an interactive network graph that shows interaction of business groups with betweenness centrality value higher than 100,000 is plotted. The nodes are colored according to their business type.

Show the code
nodes_top_betweenness <- nodes_top_betweenness %>% mutate(group = ifelse(is.na(type), "NA", type))
vis_plot <- visNetwork(nodes_top_betweenness, edges_top_betweenness,
           main = "Betweenness Centrality")%>%
  visIgraphLayout(layout = "layout_with_kk", smooth = FALSE,            
                  physics = TRUE            
                ) %>%
  visNodes(size = 50, label=nodes_top_betweenness$label) %>%
  visEdges(color = list(highlight = "lightgray"), arrows = 'to') %>%
  visOptions(selectedBy = "type",
             highlightNearest = list(enabled = TRUE,
                                     degree = 1,
                                     hover = TRUE,
                                     labelOnly = TRUE),
             nodesIdSelection = TRUE) %>%
  visGroups(groupname = "Company", color = "lightblue") %>%
  visGroups(groupname = "Company Contacts", color = "salmon") %>%
  visGroups(groupname = "Beneficial Owner", color = "yellow") %>%
  visGroups(groupname = "NA", color = "grey") %>%
  visLegend(width = 0.1) %>%
  visPhysics(repulsion = list(springlength = 50),
             maxVelocity = 2,
             solver = "forceAtlas2Based",
             forceAtlas2Based = list(gravitationalConstant = -1000),
             timestep = 0.25) %>%
  visLayout(randomSeed=4)
vis_plot

3.2.3 Degree Centrality

An interactive plot showing nodes and edges in top 95% percentile of degree_centrality will be created.

In this code chunk, edges with degree_centrality value greater than or equals to 3 are filtered and stored in new data frame.

Show the code
# Filter edges based on degree centrality threshold
edges_top_degree <- mc3_graph %>%
  filter(degree_centrality >= 3) %>%
  activate(edges)%>%
  as.tibble()%>%
  select(from, to, type, weights)

In this code chunk, nodes with degree_centrality value greater than or equals to 3 are filtered and stored in new data frame.

Show the code
# Filter nodes based on degree centrality threshold
nodes_top_degree <- mc3_graph %>%
  filter(degree_centrality >= 3) %>%
  activate(nodes) %>%
  as.tibble() %>%
  rename(label = id) %>%
  mutate(id=row_number()) %>%
  select(id, label, type, country)

Then, an interactive network graph that shows interaction of business groups with degree centrality greater than or equal to 3 is plotted. The nodes are colored according to their business type.

Show the code
nodes_top_degree <- nodes_top_degree %>% mutate(group = ifelse(is.na(type), "NA", type))
vis_plot <- visNetwork(nodes_top_degree, edges_top_degree,
           main = "Degree Centrality")%>%
  visIgraphLayout(layout = "layout_with_kk", smooth = FALSE,            
                  physics = TRUE            
                ) %>%
  visNodes(size = 50, label=nodes_top_degree$label) %>%
  visEdges(color = list(highlight = "lightgray"), arrows = 'to') %>%
  visOptions(selectedBy = "type",
             highlightNearest = list(enabled = TRUE,
                                     degree = 1,
                                     hover = TRUE,
                                     labelOnly = TRUE),
             nodesIdSelection = TRUE) %>%
  visGroups(groupname = "Company", color = "lightblue") %>%
  visGroups(groupname = "Company Contacts", color = "salmon") %>%
  visGroups(groupname = "Beneficial Owner", color = "yellow") %>%
  visGroups(groupname = "NA", color = "grey") %>%
  visLegend(width = 0.1) %>%
  visPhysics(repulsion = list(springlength = 50),
             maxVelocity = 2,
             solver = "forceAtlas2Based",
             forceAtlas2Based = list(gravitationalConstant = -1000),
             timestep = 0.25) %>%
  visLayout(randomSeed=4)
vis_plot

3.3 Company Ownership

Show the code
owner_count <- mc3_edges %>%
  filter(type == "Beneficial Owner") %>%
  group_by(target) %>%
  summarise(count = n()) %>%
  filter(count >2) %>%
  arrange(desc(count))

DT::datatable(owner_count,options = list(lengthMenu = c(3, 10, 20)))
Show the code
mean_counts <- mean(owner_count$count, na.rm = TRUE)
quartiles_counts <- quantile(owner_count$count, probs = c(0.5,0.95), na.rm = TRUE)

# Create a data frame for betweenness centrality
count_df <- data.frame(
  Measure = c("Mean", "Quartile 0.50 (Median)", "Quartile 0.95"),
  Value = c(mean_counts, quartiles_counts)
)

count_df %>%
  kbl() %>%
  kable_paper("hover", full_width = F) %>%
  row_spec(0, bold = T, color = "white", background = "#D7261E")
Measure Value
Mean 3.472574
50% Quartile 0.50 (Median) 3.000000
95% Quartile 0.95 5.000000
Show the code
target_list <- owner_count$target[owner_count$count >= 5] 
Show the code
top_owners_edges <- mc3_edges[mc3_edges$target %in% target_list, ]
Show the code
id1_owners <- top_owners_edges %>%
  select(source) %>%
  rename(id = source)
id2_owners <- top_owners_edges %>%
  select(target) %>%
  rename(id = target)
top_owners_nodes <- rbind(id1_owners, id2_owners) %>% 
  distinct() %>%
  left_join(mc3_nodes1, unmatched='drop')
Show the code
top_owners_graph <- tbl_graph(nodes = top_owners_nodes,
                       edges = top_owners_edges,
                       directed = FALSE) %>%
  mutate(betweenness_centrality = centrality_betweenness(),
         degree_centrality = centrality_degree())
Show the code
top_owners_edges_df <- top_owners_graph %>%
  activate(edges) %>%
  as_tibble()
top_owners_nodes_df <- top_owners_graph %>%
  activate(nodes) %>%
  as.tibble() %>%
  rename(label = id) %>%
  mutate(id=row_number()) %>%
  select(id, label, type, country)
Show the code
visNetwork(top_owners_nodes_df,
           top_owners_edges_df) %>%
  visIgraphLayout(layout = "layout_with_kk") %>%
  visOptions(highlightNearest = list(enabled = TRUE,
                                     degree = 1,
                                     hover = TRUE,
                                     labelOnly = TRUE),
             nodesIdSelection = TRUE) %>%
  visEdges(arrows = "to")
Show the code
top_owners_words <- left_join(select(top_owners_nodes_df,id,label),select( stopwords_removed,id,word), by = c("label"="id"), unmatched='drop')
Show the code
top_owners_words <- top_owners_words[!grepl("0|character|unknown|products", top_owners_words$word),]
Show the code
freq_word <- table(top_owners_words$word)

# Create the word cloud
wordcloud(names(freq_word), freq = freq_word, scale = c(4, 0.5), random.order = FALSE, colors = brewer.pal(8, "Dark2"))